Differential expression analysis of LCM RNA Data

Managing Packages Using Renv

To run this code in my project using the renv environment, run the following lines of code

install.packages("renv") #install the package on the new computer (may not be necessary if renv bootstraps itself as expected)
renv::restore() #reinstall all the package versions in the renv lockfile

Load packages

require("genefilter")
## Loading required package: genefilter
require("DESeq2")
## Loading required package: DESeq2
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## Warning: package 'matrixStats' was built under R version 4.3.3
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:genefilter':
## 
##     rowSds, rowVars
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following objects are masked from 'package:genefilter':
## 
##     rowSds, rowVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
require("apeglm")
## Loading required package: apeglm
require("ashr")
## Loading required package: ashr
require("ggplot2")
## Loading required package: ggplot2
require("vsn")
## Loading required package: vsn
require("hexbin")
## Loading required package: hexbin
## Warning: package 'hexbin' was built under R version 4.3.3
require("pheatmap")
## Loading required package: pheatmap
require("RColorBrewer")
## Loading required package: RColorBrewer
require("EnhancedVolcano")
## Loading required package: EnhancedVolcano
## Loading required package: ggrepel
## Warning: package 'ggrepel' was built under R version 4.3.3
require("tidyverse")
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ dplyr::collapse()     masks IRanges::collapse()
## ✖ dplyr::combine()      masks Biobase::combine(), BiocGenerics::combine()
## ✖ dplyr::count()        masks matrixStats::count()
## ✖ dplyr::desc()         masks IRanges::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ dplyr::first()        masks S4Vectors::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ dplyr::rename()       masks S4Vectors::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::slice()        masks IRanges::slice()
## ✖ readr::spec()         masks genefilter::spec()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
sessionInfo() #provides list of loaded packages and version of R.
## R version 4.3.2 (2023-10-31)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.0
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices datasets  utils     methods  
## [8] base     
## 
## other attached packages:
##  [1] lubridate_1.9.3             forcats_1.0.0              
##  [3] stringr_1.5.1               dplyr_1.1.4                
##  [5] purrr_1.0.2                 readr_2.1.5                
##  [7] tidyr_1.3.1                 tibble_3.2.1               
##  [9] tidyverse_2.0.0             EnhancedVolcano_1.18.0     
## [11] ggrepel_0.9.6               RColorBrewer_1.1-3         
## [13] pheatmap_1.0.12             hexbin_1.28.5              
## [15] vsn_3.68.0                  ggplot2_3.5.1              
## [17] ashr_2.2-63                 apeglm_1.22.1              
## [19] DESeq2_1.40.2               SummarizedExperiment_1.30.2
## [21] Biobase_2.60.0              MatrixGenerics_1.12.3      
## [23] matrixStats_1.4.1           GenomicRanges_1.54.1       
## [25] GenomeInfoDb_1.36.4         IRanges_2.34.1             
## [27] S4Vectors_0.38.2            BiocGenerics_0.46.0        
## [29] genefilter_1.82.1          
## 
## loaded via a namespace (and not attached):
##  [1] DBI_1.2.3               bitops_1.0-9            rlang_1.1.4            
##  [4] magrittr_2.0.3          compiler_4.3.2          RSQLite_2.3.9          
##  [7] png_0.1-8               vctrs_0.6.5             pkgconfig_2.0.3        
## [10] crayon_1.5.3            fastmap_1.2.0           XVector_0.40.0         
## [13] utf8_1.2.4              rmarkdown_2.28          tzdb_0.4.0             
## [16] preprocessCore_1.62.1   bit_4.5.0               xfun_0.48              
## [19] zlibbioc_1.46.0         cachem_1.1.0            jsonlite_1.8.9         
## [22] blob_1.2.4              DelayedArray_0.26.7     BiocParallel_1.34.2    
## [25] irlba_2.3.5.1           parallel_4.3.2          R6_2.5.1               
## [28] stringi_1.8.4           bslib_0.8.0             limma_3.56.2           
## [31] SQUAREM_2021.1          jquerylib_0.1.4         numDeriv_2016.8-1.1    
## [34] Rcpp_1.0.13-1           knitr_1.48              timechange_0.3.0       
## [37] Matrix_1.6-5            splines_4.3.2           tidyselect_1.2.1       
## [40] rstudioapi_0.17.0       abind_1.4-8             yaml_2.3.10            
## [43] codetools_0.2-20        affy_1.78.2             lattice_0.22-6         
## [46] plyr_1.8.9              withr_3.0.1             KEGGREST_1.40.1        
## [49] coda_0.19-4.1           evaluate_1.0.1          survival_3.7-0         
## [52] Biostrings_2.70.3       pillar_1.9.0            affyio_1.70.0          
## [55] BiocManager_1.30.25     renv_1.0.11             generics_0.1.3         
## [58] invgamma_1.1            RCurl_1.98-1.16         truncnorm_1.0-9        
## [61] emdbook_1.3.13          hms_1.1.3               munsell_0.5.1          
## [64] scales_1.3.0            xtable_1.8-4            glue_1.8.0             
## [67] tools_4.3.2             annotate_1.78.0         locfit_1.5-9.10        
## [70] mvtnorm_1.3-2           XML_3.99-0.17           grid_4.3.2             
## [73] bbmle_1.0.25.1          bdsmatrix_1.3-7         AnnotationDbi_1.64.1   
## [76] colorspace_2.1-1        GenomeInfoDbData_1.2.10 cli_3.6.3              
## [79] fansi_1.0.6             mixsqp_0.3-54           S4Arrays_1.0.6         
## [82] gtable_0.3.5            sass_0.4.9              digest_0.6.37          
## [85] memoise_2.0.1           htmltools_0.5.8.1       lifecycle_1.0.4        
## [88] httr_1.4.7              bit64_4.5.2             MASS_7.3-60.0.1
save_ggplot <- function(plot, filename, width = 10, height = 7, units = "in", dpi = 300) {
  # Display plot
  print(plot)
  
  # Save plot
  ggsave(filename = paste0(filename, ".png"), plot = plot, width = width, height = height, units = units, dpi = dpi)
}

# Specify colors
ann_colors = list(
    Tissue = c(OralEpi = "palegreen3" ,Aboral = "mediumpurple1")
)

Read and clean count matrix and metadata

Read in raw count data

counts_raw <- read.csv("../output_RNA/stringtie-GeneExt/LCM_RNA_gene_count_matrix.csv", row.names = 1) #load in data

gene_id,LCM_15,LCM_16,LCM_20,LCM_21,LCM_26,LCM_27,LCM_4,LCM_5,LCM_8,LCM_9

Read in metadata

meta <- read.csv("../data_RNA/LCM_RNA_metadata.csv") %>%
            dplyr::arrange(Sample) %>%
            mutate(across(c(Tissue, Fragment, Section_Date, LCM_Date), factor)) # Set variables as factors 

meta$Tissue <- factor(meta$Tissue, levels = c("OralEpi","Aboral")) #we want OralEpi to be the baseline

Data sanity checks!

all(meta$Sample %in% colnames(counts_raw)) #are all of the sample names in the metadata column names in the gene count matrix? Should be TRUE
## [1] TRUE
all(meta$Sample == colnames(counts_raw)) #are they the same in the same order? Should be TRUE
## [1] TRUE

pOverA filtering to reduce dataset

ffun<-filterfun(pOverA(0.5,10))  #Keep genes expressed in at least 50% of samples -
counts_filt_poa <- genefilter((counts_raw), ffun) #apply filter

filtered_counts <- counts_raw[counts_filt_poa,] #keep only rows that passed filter

cat("Number of genes after filtering:", sum(counts_filt_poa))
## Number of genes after filtering: 14464
write.csv(filtered_counts, "../output_RNA/differential_expression/filtered_counts.csv")

There are now 14464 genes in the filtered dataset.

Data sanity checks:

all(meta$Sample %in% colnames(filtered_counts)) #are all of the sample names in the metadata column names in the gene count matrix?
## [1] TRUE
all(meta$Sample == colnames(filtered_counts))  #are they the same in the same order? Should be TRUE
## [1] TRUE

DESeq2

Create DESeq object and run DESeq2

dds <- DESeqDataSetFromMatrix(countData = filtered_counts,
                              colData = meta,
                              design= ~ Fragment + Tissue)

dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing

### Extract results for Aboral vs. OralEpi contrast

res <- results(dds, contrast = c("Tissue","Aboral","OralEpi"))
resLFC <- lfcShrink(dds, coef="Tissue_Aboral_vs_OralEpi", res=res, type = "apeglm")
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
##     Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
##     sequence count data: removing the noise and preserving large differences.
##     Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

Extract results for adjusted p-value < 0.05

res <- resLFC

resOrdered <- res[order(res$pvalue),]# save differentially expressed genes

DE_05 <- as.data.frame(resOrdered) %>% filter(padj < 0.05)
DE_05_Up <- DE_05 %>% filter(log2FoldChange > 0) #Higher in Aboral, Lower in OralEpi
DE_05_Down <- DE_05 %>% filter(log2FoldChange < 0) #Lower in Aboral, Higher in OralEpi

nrow(DE_05)
## [1] 3606
nrow(DE_05_Up) #Higher in Aboral, Lower in OralEpi
## [1] 804
nrow(DE_05_Down) #Lower in Aboral, Higher in OralEpi
## [1] 2802
write.csv(as.data.frame(resOrdered), 
          file="../output_RNA/differential_expression/DESeq_results.csv")

write.csv(DE_05, 
          file="../output_RNA/differential_expression/DEG_05.csv")

Visualizing Differential Expression

EnhancedVolcano(resLFC, 
    lab = rownames(resLFC),
    x = 'log2FoldChange',
    y = 'pvalue')

Plots

plotMA(results(dds, contrast = c("Tissue","Aboral","OralEpi")), ylim=c(-20,20))

plotMA(resLFC, ylim=c(-20,20))

Log2 Fold Change Comparison

# because we are interested in the comparison and not the intercept, we set 'coef=2'
resNorm <- lfcShrink(dds, coef=6, type="normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## 
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
resAsh <- lfcShrink(dds, coef=6, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
##     Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
##     https://doi.org/10.1093/biostatistics/kxw041
par(mfrow=c(1,3), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-20,20)
plotMA(resLFC, xlim=xlim, ylim=ylim, main="apeglm")
plotMA(resNorm, xlim=xlim, ylim=ylim, main="normal")
plotMA(resAsh, xlim=xlim, ylim=ylim, main="ashr")

plotCounts(dds, gene=which.max(res$log2FoldChange), intgroup="Tissue")

plotCounts(dds, gene=which.min(res$log2FoldChange), intgroup="Tissue")

Transforming count data for visualization

vsd <- vst(dds, blind=FALSE)
rld <- rlog(dds, blind=FALSE)
ntd <- normTransform(dds) # this gives log2(n + 1)

meanSdPlot(assay(vsd), main = "vsd")
## Warning in geom_hex(bins = bins, ...): Ignoring unknown parameters: `main`

meanSdPlot(assay(rld))

meanSdPlot(assay(ntd))

#save the vsd transformation
vsd_mat <- assay(vsd)
write.csv(vsd_mat, file = "../output_RNA/differential_expression/vsd_expression_matrix.csv")

Will move forward with vst transformation for visualizations

Heatmap of count matrix

df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

#view all genes
pheatmap(assay(vsd), cluster_rows=TRUE, show_rownames=FALSE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)),
         annotation_colors = ann_colors,color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200))

#view highest count genes
select <- order(rowMeans(counts(dds,normalized=TRUE)),
                decreasing=TRUE)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)),
         annotation_colors = ann_colors, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200))

#view most significantly differentially expressed genes

select <- order(res$padj)[1:20]

pheatmap(assay(vsd)[select,], cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2, annotation_col=(df%>% select(Tissue)),
         annotation_colors = ann_colors,color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200))

Heatmap of the sample-to-sample distances

sampleDists <- dist(t(assay(vsd)))

sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- paste(vsd$Tissue, vsd$Fragment, sep="-")
colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
         clustering_distance_rows=sampleDists,
         clustering_distance_cols=sampleDists,
         col=colors)

Principal component plot of the samples

pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE, ntop = 14464)

percentVar <- round(100 * attr(pcaData, "percentVar"))
PCA <- ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "palegreen3"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

save_ggplot(PCA, "../output_RNA/differential_expression/PCA_allgenes")

PCA_small <- ggplot(pcaData, aes(PC1, PC2, color=Tissue)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "palegreen3"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

ggsave(filename = paste0("../output_RNA/differential_expression/PCA_allgenes_small", ".png"), plot = PCA_small, width = 4, height = 2.5, units = "in", dpi = 300)
pcaData <- plotPCA(vsd, intgroup=c("Tissue", "Fragment"), returnData=TRUE)

percentVar <- round(100 * attr(pcaData, "percentVar"))
PCA <- ggplot(pcaData, aes(PC1, PC2, color=Tissue, shape=Fragment)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "palegreen3"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

save_ggplot(PCA, "../output_RNA/differential_expression/PCA")

PCA_small <- ggplot(pcaData, aes(PC1, PC2, color=Tissue)) +
  geom_point(size=2) +
  scale_color_manual(values = c("Aboral" = "mediumpurple1", "OralEpi" = "palegreen3"))+
  xlab(paste0("PC1: ",percentVar[1],"% variance")) +
  ylab(paste0("PC2: ",percentVar[2],"% variance")) + 
  coord_fixed() + theme_bw()

ggsave(filename = paste0("../output_RNA/differential_expression/PCA_small", ".png"), plot = PCA_small, width = 4, height = 2.5, units = "in", dpi = 300)

Clearly, the majority of the variance in the data is explained by tissue type!

Annotation data

Download annotation files from genome website


# wget files
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.EggNog_results.txt.gz

wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.KEGG_results.txt.gz

# move to references direcotry
mv *gz ../references

# unzip files
gunzip ../references/*gz
EggNog <- read.delim("../references/Pocillopora_acuta_HIv2.genes.EggNog_results.txt") %>% dplyr::rename("query" = X.query)

CDSearch <- read.delim("../references/Pocillopora_acuta_HIv2.genes.Conserved_Domain_Search_results.txt", quote = "") %>% dplyr::rename("query" = X.Query)

KEGG <- read.delim("../references/Pocillopora_acuta_HIv2.genes.KEGG_results.txt", header = FALSE) %>% dplyr::rename("query" = V1, "KeggTerm" = V2)
DE_05$query <- rownames(DE_05)
DE_05_annot <- DE_05 %>% left_join(CDSearch) %>% select(query,everything())
## Joining with `by = join_by(query)`
DE_05_eggnog <- DE_05 %>% left_join(EggNog) %>% select(query,everything())
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_eggnog), 
          file="../output_RNA/differential_expression/DE_05_eggnog_annotation.csv")

annot_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(CDSearch)
## Joining with `by = join_by(query)`
eggnog_all <- as.data.frame(rownames(dds)) %>% dplyr::rename("query" = `rownames(dds)`) %>% left_join(EggNog)
## Joining with `by = join_by(query)`
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])
gene_labels <- eggnog_all %>% select(query,PFAMs) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) %>% #replace NAs with "" for labelling purposes
  separate(PFAMs, into = c("PFAMs", "rest of name"), sep = ",(?=.*?,)", extra = "merge")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 12723 rows [1, 2, 3, 4,
## 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 19, 20, 21, 23, 24, 25, ...].
#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_DE")

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "../output_RNA/differential_expression/up_Aboral")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[select,"PFAMs"], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "../output_RNA/differential_expression/up_OralEpi")

Genes of Interest

MarkerGenes <- read.csv("../references/Pacuta_MarkerGenes_Levy2021.csv") %>% dplyr::rename("query" = 1, "List" = 2, "definition" = 3) %>% filter(List !="Toolkit")

MarkerGenes_broc <- read.csv("../output_RNA/marker_genes/Pacuta_Spis_Markers_pairs.csv") %>% select(protein_id_spB,cluster,Standardized_Name_spA ) %>% dplyr::rename("query" = 1, "List" = 2)

MarkerGenes$def_short <- ifelse(nchar(MarkerGenes$definition) > 20, 
                            paste0(substr(MarkerGenes$definition, 1, 17), "..."), 
                            MarkerGenes$definition)
HoxGenes_Nvec <- read.csv("../output_RNA/marker_genes/Hox_nematostella.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X))

HoxGenes_Nvec$def_short <- gsub("Homeobox protein", "Hox", HoxGenes_Nvec$Description)
HoxGenes_Nvec$def_short <- gsub("homeobox protein", "Hox", HoxGenes_Nvec$def_short)

He_etal_Nvec <- read.csv("../output_RNA/marker_genes/He_etal_nematostella.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X))

He_etal_Nvec$def_short <- gsub("Homeobox protein", "Hox", He_etal_Nvec$Description)
He_etal_Nvec$def_short <- gsub("homeobox protein", "Hox", He_etal_Nvec$def_short)

DuBuc_etal_Nvec <- read.csv("../output_RNA/marker_genes/Wnt_nematostella.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X))
Biomin <- read.csv("../output_RNA/marker_genes/Pacuta_Biomin.csv") %>% dplyr::rename("query" = Pocillopora_acuta_best_hit) %>% select(-c(accessionnumber.geneID, Ref))
Biomin_broc <- read.csv("../output_RNA/marker_genes/Pacuta_Biomin_Spis_ortholog.csv") %>% dplyr::rename("query" = Pacuta_gene) %>% select(-c(X,accessionnumber_gene_id, ref))

Biomin <- Biomin %>%
  group_by(query,List) %>%
  summarize(definition = paste(unique(definition), collapse = ","))
## `summarise()` has grouped output by 'query'. You can override using the
## `.groups` argument.
Biomin$def_short <- ifelse(nchar(Biomin$definition) > 40, 
                            paste0(substr(Biomin$definition, 1, 37), "..."), 
                            Biomin$definition)

Biomin_filtered_counts <- filtered_counts[(rownames(filtered_counts) %in% Biomin$query),]

Biomin_broc <- Biomin_broc %>%
  group_by(query,List) %>%
  summarize(definition = paste(unique(definition), collapse = ","))
## `summarise()` has grouped output by 'query'. You can override using the
## `.groups` argument.
Biomin_broc$def_short <- ifelse(nchar(Biomin_broc$definition) > 40, 
                            paste0(substr(Biomin_broc$definition, 1, 37), "..."), 
                            Biomin_broc$definition)

Biomin_broc_filtered_counts <- filtered_counts[(rownames(filtered_counts) %in% Biomin_broc$query),]

write.csv(Biomin_filtered_counts, "../output_RNA/differential_expression/Biomin_filtered_counts.csv")
DE_05$query <- rownames(DE_05)
resOrdered$query <- rownames(resOrdered)

DE_05_biomin <- DE_05 %>% left_join(Biomin) %>% select(query,everything()) %>% drop_na()
## Joining with `by = join_by(query)`
DESeq_biomin <- as.data.frame(resOrdered) %>% left_join(Biomin) %>% select(query,everything()) %>% drop_na()
## Joining with `by = join_by(query)`
DE_05_Biomin_broc <- DE_05 %>% left_join(Biomin_broc) %>% select(query,everything()) %>% drop_na()
## Joining with `by = join_by(query)`
DESeq_Biomin_broc <- as.data.frame(resOrdered) %>% left_join(Biomin_broc) %>% select(query,everything()) %>% drop_na()
## Joining with `by = join_by(query)`
DE_05_marker <- DE_05 %>% left_join(MarkerGenes) %>% select(query,everything())  %>% drop_na()
## Joining with `by = join_by(query)`
DE_05_marker_broc <- DE_05 %>% left_join(MarkerGenes_broc) %>% select(query,everything())  %>% drop_na()
## Joining with `by = join_by(query)`
DE_05_Hox <- DE_05 %>% left_join(HoxGenes_Nvec) %>% select(query,everything())  %>% drop_na()
## Joining with `by = join_by(query)`
DE_05_He_etal <- DE_05 %>% left_join(He_etal_Nvec) %>% select(query,everything())  %>% drop_na()
## Joining with `by = join_by(query)`
DE_05_DuBuc_etal <- DE_05 %>% left_join(DuBuc_etal_Nvec) %>% select(query,everything())  %>% drop_na()
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_biomin), 
          file="../output_RNA/differential_expression/DE_05_biomin_annotation.csv")

write.csv(as.data.frame(DE_05_marker), 
          file="../output_RNA/differential_expression/DE_05_markergene_annotation.csv")

biomin_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(Biomin) 
## Joining with `by = join_by(query)`
biomin_all_res <- as.data.frame(resLFC) %>% mutate(query = rownames(resLFC)) %>% select(query,everything()) %>% left_join(Biomin) 
## Joining with `by = join_by(query)`
Biomin_broc_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(Biomin_broc) 
## Joining with `by = join_by(query)`
Biomin_broc_all_res <- as.data.frame(resLFC) %>% mutate(query = rownames(resLFC)) %>% select(query,everything()) %>% left_join(Biomin_broc) 
## Joining with `by = join_by(query)`
markers_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(MarkerGenes) 
## Joining with `by = join_by(query)`
markers_all_res <- as.data.frame(resLFC) %>% mutate(query = rownames(resLFC)) %>% select(query,everything()) %>% left_join(MarkerGenes) 
## Joining with `by = join_by(query)`
broc_markers_all_counts <- as.data.frame(counts(dds)) %>% mutate(query = rownames(dds)) %>% select(query,everything()) %>% left_join(MarkerGenes_broc) 
## Joining with `by = join_by(query)`
broc_markers_all_res <- as.data.frame(resLFC) %>% mutate(query = rownames(resLFC)) %>% select(query,everything()) %>% left_join(MarkerGenes_broc) 
## Joining with `by = join_by(query)`
Hox_all_res <- as.data.frame(resLFC) %>% mutate(query = rownames(resLFC)) %>% select(query,everything()) %>% left_join(HoxGenes_Nvec) 
## Joining with `by = join_by(query)`
Hox_all_res %>% drop_na()
##                                        query   baseMean log2FoldChange
## 1      Pocillopora_acuta_HIv2___TS.g15143.t1   521.8679      0.1422022
## 2  Pocillopora_acuta_HIv2___RNAseq.g12537.t1   271.6692      0.1068577
## 3  Pocillopora_acuta_HIv2___RNAseq.g24142.t1   677.5990     -0.2872155
## 4    Pocillopora_acuta_HIv2___RNAseq.g614.t1  2049.6550      3.4581160
## 5    Pocillopora_acuta_HIv2___RNAseq.18045_t  1074.8238      0.8607645
## 6   Pocillopora_acuta_HIv2___RNAseq.g3322.t1  1641.0298    -12.6389456
## 7      Pocillopora_acuta_HIv2___TS.g25076.t1  2172.4212      0.3659500
## 8   Pocillopora_acuta_HIv2___RNAseq.g1763.t1   844.6168      3.1839530
## 9   Pocillopora_acuta_HIv2___RNAseq.g9587.t1  1162.9301      1.5381041
## 10  Pocillopora_acuta_HIv2___RNAseq.g9588.t1 10129.8808     13.0019191
## 11 Pocillopora_acuta_HIv2___RNAseq.g26215.t1   665.0359      0.4339101
## 12 Pocillopora_acuta_HIv2___RNAseq.g26215.t1   665.0359      0.4339101
##        lfcSE       pvalue         padj Gene_Name
## 1  1.0156290 1.403026e-02 5.531033e-02      B-H1
## 2  1.0129642 1.620554e-02 6.222378e-02      B-H1
## 3  0.9977356 1.453626e-01 3.413047e-01   Anthox1
## 4  1.8621899 1.683827e-04 1.284540e-03    Six4/5
## 5  1.4145958 5.029165e-03 2.338214e-02  Nkx2.2a1
## 6  3.2203985 3.690243e-11 2.629343e-09       Dlx
## 7  0.6635312 4.480396e-01 6.901301e-01  Anthox6a
## 8  1.7839837 2.174546e-04 1.603908e-03      Alx4
## 9  1.6584118 1.648080e-02 6.316330e-02      OtxC
## 10 2.9159022 2.001347e-17 1.929832e-14      OtxB
## 11 1.1064758 5.857990e-02 1.759343e-01  Anthox8a
## 12 1.1064758 5.857990e-02 1.759343e-01  Anthox8b
##                           Description             def_short
## 1               Homeobox protein B-H1              Hox B-H1
## 2               Homeobox protein B-H1              Hox B-H1
## 3            Homeobox protein Anthox1           Hox Anthox1
## 4             Homeobox protein SIX4/5            Hox SIX4/5
## 5           Homeobox protein Nkx-2.2a          Hox Nkx-2.2a
## 6                Homeobox protein Dlx               Hox Dlx
## 7           Homeobox protein Anthox6a          Hox Anthox6a
## 8  Homeobox protein aristaless-like 4 Hox aristaless-like 4
## 9               Homeobox protein OTX1              Hox OTX1
## 10            Homeobox protein OTX1 B            Hox OTX1 B
## 11          Homeobox protein Anthox8a          Hox Anthox8a
## 12          Homeobox protein Anthox8b          Hox Anthox8b
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

#view biomin genes that are differntially expressed

z_scores <- t(scale(t(assay(vsd)[DE_05_biomin$query, ]), center = TRUE, scale = TRUE))
DE_biomin <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_biomin$def_short, fontsize_row = 5)
DE_biomin
save_ggplot(DE_biomin, "../output_RNA/differential_expression/DE_biomin")

#view biomin genes that are differntially expressed

z_scores <- t(scale(t(assay(vsd)[DE_05_Biomin_broc$query, ]), center = TRUE, scale = TRUE))
DE_Biomin_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_Biomin_broc$def_short, fontsize_row = 5)
DE_Biomin_broc
save_ggplot(DE_Biomin_broc, "../output_RNA/differential_expression/DE_Biomin_broc")

#view marker genes that are differntially expressed

z_scores <- t(scale(t(assay(vsd)[DE_05_marker$query, ]), center = TRUE, scale = TRUE))
DE_marker <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,cutree_rows = 5,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_marker$List, fontsize_row = 4)
DE_marker
save_ggplot(DE_marker, "../output_RNA/differential_expression/DE_marker")

DE_05_marker_grouped <- DE_05_marker %>% arrange(List) %>% mutate(List = as.factor(List))

z_scores <- t(scale(t(assay(vsd)[DE_05_marker_grouped$quer, ]), center = TRUE, scale = TRUE))
DE_05_marker_grouped_plot <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_marker_grouped$List, fontsize_row = 5)

DE_05_marker_grouped_plot
save_ggplot(DE_05_marker_grouped_plot, "../output_RNA/differential_expression/DE_05_marker_grouped")

#view marker genes that are differntially expressed

z_scores <- t(scale(t(assay(vsd)[DE_05_marker_broc$query, ]), center = TRUE, scale = TRUE))
DE_marker <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,cutree_rows = 5,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_marker_broc$List, fontsize_row = 4)
DE_marker
save_ggplot(DE_marker, "../output_RNA/differential_expression/DE_marker_broc")

DE_05_marker_broc_grouped <- DE_05_marker_broc %>% arrange(List) %>% mutate(List = as.factor(List))

z_scores <- t(scale(t(assay(vsd)[DE_05_marker_broc_grouped$quer, ]), center = TRUE, scale = TRUE))
DE_05_marker_broc_grouped_plot <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = DE_05_marker_broc_grouped$List, fontsize_row = 5)

DE_05_marker_broc_grouped_plot
save_ggplot(DE_05_marker_broc_grouped_plot, "../output_RNA/differential_expression/DE_05_marker_broc_grouped")

Visualizing Differential Expression

Biomin_volcano <- EnhancedVolcano(biomin_all_res, 
    lab = biomin_all_res$def_short,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 40)

save_ggplot(Biomin_volcano, "../output_RNA/differential_expression/Biomin_volcano")

Biomin_broc_volcano <- EnhancedVolcano(Biomin_broc_all_res, 
    lab = Biomin_broc_all_res$def_short,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 40)

save_ggplot(Biomin_broc_volcano, "../output_RNA/differential_expression/Biomin_broc_volcano")

Marker_volcano <- EnhancedVolcano(markers_all_res, 
    lab = markers_all_res$List,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 60)

save_ggplot(Marker_volcano, "../output_RNA/differential_expression/Marker_volcano")

Marker_volcano_names <- EnhancedVolcano(markers_all_res, 
    lab = markers_all_res$def_short,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 60)

save_ggplot(Marker_volcano_names, "../output_RNA/differential_expression/Marker_volcano_names")

Marker_volcano <- EnhancedVolcano(broc_markers_all_res, 
    lab = broc_markers_all_res$List,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.01,
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 60)

save_ggplot(Marker_volcano, "../output_RNA/differential_expression/Marker_volcano_broc")

EnhancedVolcano(resLFC, 
    lab = rownames(resLFC),
    x = 'log2FoldChange',
    y = 'pvalue')

volcano_plain <- EnhancedVolcano(resLFC, 
    lab = NA,
    x = 'log2FoldChange',
    y = 'padj',
    pCutoff = 0.05,
    title="",
    subtitle="",
    drawConnectors = TRUE,
    widthConnectors = 0.75,
    pointSize = 1,
    labSize = 2,boxedLabels = TRUE,max.overlaps = 60)

save_ggplot(volcano_plain, "../output_RNA/differential_expression/volcano_plain",width = 4, height = 6, units = "in", dpi = 300)

save_ggplot(volcano_plain, "../output_RNA/differential_expression/volcano_plain")

Trying to annotate un-annotated DE genes:

# wget protein sequence reference file
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.pep.faa.gz

# move to references direcotry
mv *gz ../references

# unzip files
gunzip ../references/*gz

#get the names of all the DEGs from the first column of the DEG csv file
tail -n +2 ../output_RNA/differential_expression/DEG_05.csv | cut -d',' -f1 | tr -d '"' > ../output_RNA/differential_expression/DEG_05_names.csv

#grep this file against the protein fasta file, first with wc -l to make sure the number of lines is correct (should be your number of DEGs)
grep -f ../output_RNA/differential_expression/DEG_05_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa | wc -l

#grep each header with the protein sequence after ("-A 1") and save to a new file
grep  -A 1 -f ../output_RNA/differential_expression/DEG_05_names.csv ../references/Pocillopora_acuta_HIv2.genes.pep.faa > ../output_RNA/differential_expression/DEG_05_seqs.txt

nr database BLAST of DE_05 genes only

On andromeda:

Blastp-ing only the DE genes against the entire nr database (will take a while)

cd ../scripts
nano DEG_05_blast.sh
#!/bin/bash
#SBATCH --job-name="DE_blast"
#SBATCH -t 240:00:00
#SBATCH --export=NONE
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH --mail-user=zdellaert@uri.edu #your email to send notifications
#SBATCH --mem=500GB
#SBATCH --error=../scripts/outs_errs/"%x_error.%j" #write out slurm error reports
#SBATCH --output=../scripts/outs_errs/"%x_output.%j" #write out any program outpus
#SBATCH --account=putnamlab
#SBATCH --nodes=2 --ntasks-per-node=24

module load BLAST+/2.15.0-gompi-2023a

cd ../output_RNA/differential_expression #set working directory
mkdir blast
cd blast

#nr database location andromeda: /data/shared/ncbi-db/.ncbirc 
# points to current location: cat /data/shared/ncbi-db/.ncbirc
# [BLAST]
# BLASTDB=/data/shared/ncbi-db/2024-11-10

blastp -query ../DEG_05_seqs.txt -db nr -out DEG_05_blast_results.txt -outfmt 0 -evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 10

blastp -query ../DEG_05_seqs.txt -db nr -out DEG_05_blast_results_tab.txt -outfmt 6 -evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1

echo "Blast complete!" $(date)
sbatch DEG_05_blast.sh
cd ../output_RNA/differential_expression/blast

wc -l DEG_05_blast_results_tab.txt #3537 of the 3606 genes were annotated

#get just the NCBI database accession numbers for the blast results
cut -f2 DEG_05_blast_results_tab.txt > DEG_05_blast_accessions.txt

#remove any duplicates
sort -u  DEG_05_blast_accessions.txt > unique_DEG_05_blast_accessions.txt

wc -l unique_DEG_05_blast_accessions.txt #3404 of the 3537 annotations were unique

while read acc; do
  curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=$acc&rettype=gp&retmode=text" \
  | grep "DEFINITION" | sed 's/DEFINITION  //g' | awk -v id="$acc" '{print id "\t" $0}'
done < unique_DEG_05_blast_accessions.txt > DEG_05_blast_names.txt

wc -l DEG_05_blast_names.txt #3396 ; unsure why 8 are missing.

join -1 2 -2 1 -t $'\t' <(sort -k2 DEG_05_blast_results_tab.txt) <(sort DEG_05_blast_names.txt) > annotated_DEG_05_blast_results_tab.txt

SwissProt of ALL genes

On unity:

swissprot based on https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd and https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd

Steven’s notebook post here: https://sr320.github.io/tumbling-oysters/posts/sr320-27-go/

mkdir ../references/blast_dbs
cd ../references/blast_dbs
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2024_10_02.fasta.gz
gunzip -k uniprot_sprot_r2024_10_02.fasta.gz
rm uniprot_sprot_r2024_10_02.fasta.gz

head uniprot_sprot_r2024_10_02.fasta
echo "Number of Sequences"
grep -c ">" uniprot_sprot_r2024_10_02.fasta
# 572214 sequences
module load blast-plus/2.14.1

makeblastdb \
-in ../references/blast_dbs/uniprot_sprot_r2024_10_02.fasta \
-dbtype prot \
-out ../references/blast_dbs/uniprot_sprot_r2024_10_02
cd ../scripts
nano blastp_SwissProt.sh
#!/bin/bash
#SBATCH -t 18:00:00
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=48
#SBATCH --mem=500GB
#SBATCH --export=NONE
#SBATCH --error=../scripts/outs_errs/"%x_error.%j" #write out slurm error reports
#SBATCH --output=../scripts/outs_errs/"%x_output.%j" #write out any program outpus
#SBATCH --mail-type=BEGIN,END,FAIL #email you when job starts, stops and/or fails
#SBATCH -D /project/pi_hputnam_uri_edu/zdellaert/LaserCoral #set working directory

module load blast-plus/2.14.1

cd references/
mkdir annotation

fasta="Pocillopora_acuta_HIv2.genes.pep.faa"

blastp \
-query $fasta \
-db blast_dbs/uniprot_sprot_r2024_10_02 \
-out annotation/blastp_SwissProt_out.tab \
-evalue 1E-05 \
-num_threads 48 \
-max_target_seqs 1 \
-max_hsps 1 \
-outfmt 6

echo "Blast complete!" $(date)
cd references/annotation/

tr '|' '\t' <  blastp_SwissProt_out.tab >  blastp_SwissProt_out_sep.tab
cd ../references/annotation/

curl -H "Accept: text/plain; format=tsv" "https://rest.uniprot.org/uniprotkb/stream?fields=accession%2Creviewed%2Cid%2Cprotein_name%2Cgene_names%2Corganism_name%2Clength%2Cgo_p%2Cgo%2Cgo_id%2Cgo_c%2Cgo_f&format=tsv&query=%28reviewed%3Atrue%29" -o SwissProt-Annot-GO_111524.tsv

wc -l SwissProt-Annot-GO_111524.tsv
#572215

All code below based on https://github.com/urol-e5/deep-dive/blob/main/D-Apul/code/20-Apul-gene-annotation.Rmd and https://github.com/urol-e5/deep-dive/blob/main/F-Pmea/code/20-Pmea-gene-annotation.Rmd

Steven’s notebook post here: https://sr320.github.io/tumbling-oysters/posts/sr320-27-go/

bltabl <- read.csv("../references/annotation/blastp_SwissProt_out_sep.tab", sep = '\t', header = FALSE)

spgo <- read.csv("../references/annotation/SwissProt-Annot-GO_111524.tsv", sep = '\t', header = TRUE)
annot_tab <- left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
  select(
    query = V1,
    blast_hit = V3,
    evalue = V13,
    ProteinNames = Protein.names,
    BiologicalProcess = Gene.Ontology..biological.process.,
    GeneOntologyIDs = Gene.Ontology.IDs
  )

head(annot_tab)
##                                        query blast_hit    evalue
## 1 Pocillopora_acuta_HIv2___RNAseq.g24143.t1a    Q4JAI4  1.02e-37
## 2  Pocillopora_acuta_HIv2___RNAseq.g18333.t1    O08807 9.62e-116
## 3   Pocillopora_acuta_HIv2___RNAseq.g7985.t1    O74212 3.56e-158
## 4      Pocillopora_acuta_HIv2___TS.g15308.t1    Q09575  1.08e-12
## 5   Pocillopora_acuta_HIv2___RNAseq.g2057.t1    P0C1P0  8.81e-14
## 6   Pocillopora_acuta_HIv2___RNAseq.g4696.t1    Q9W2Q5  8.98e-69
##                                                                                                                                                                                                     ProteinNames
## 1                                                                                                                                              Methionine synthase (EC 2.1.1.-) (Homocysteine methyltransferase)
## 2 Peroxiredoxin-4 (EC 1.11.1.24) (Antioxidant enzyme AOE372) (Peroxiredoxin IV) (Prx-IV) (Thioredoxin peroxidase AO372) (Thioredoxin-dependent peroxide reductase A0372) (Thioredoxin-dependent peroxiredoxin 4)
## 3                                                                                                   Acyl-lipid (8-3)-desaturase (EC 1.14.19.30) (Delta(5) fatty acid desaturase) (Delta-5 fatty acid desaturase)
## 4                                                                                                                                                                                Uncharacterized protein K02A2.6
## 5                                                                              Phosphatidylinositol N-acetylglucosaminyltransferase subunit Y (Phosphatidylinositol-glycan biosynthesis class Y protein) (PIG-Y)
## 6                                                                                                                                                                   Calcium and integrin-binding family member 2
##                                                                                                                                                                                                                                                                                                                                                                                                                   BiologicalProcess
## 1                                                                                                                                                                                                                                                                                                                                                            methionine biosynthetic process [GO:0009086]; methylation [GO:0032259]
## 2 cell redox homeostasis [GO:0045454]; extracellular matrix organization [GO:0030198]; hydrogen peroxide catabolic process [GO:0042744]; male gonad development [GO:0008584]; negative regulation of male germ cell proliferation [GO:2000255]; protein maturation by protein folding [GO:0022417]; reactive oxygen species metabolic process [GO:0072593]; response to oxidative stress [GO:0006979]; spermatogenesis [GO:0007283]
## 3                                                                                                                                                                                                                                                                                                                 long-chain fatty acid biosynthetic process [GO:0042759]; unsaturated fatty acid biosynthetic process [GO:0006636]
## 4                                                                                                                                                                                                                                                                                                                                                                                                      DNA integration [GO:0015074]
## 5                                                                                                                                                                                                                                                                                                                                                                                      GPI anchor biosynthetic process [GO:0006506]
## 6                                                                                                                                                                                                                                                                                                                                                              calcium ion homeostasis [GO:0055074]; phototransduction [GO:0007602]
##                                                                                                                                                                                                          GeneOntologyIDs
## 1                                                                                                                                                                         GO:0003871; GO:0008270; GO:0009086; GO:0032259
## 2 GO:0005615; GO:0005737; GO:0005739; GO:0005783; GO:0005790; GO:0005829; GO:0006979; GO:0007283; GO:0008379; GO:0008584; GO:0022417; GO:0030198; GO:0042744; GO:0042802; GO:0045454; GO:0072593; GO:0140313; GO:2000255
## 3                                                                                                                                                 GO:0006636; GO:0016020; GO:0020037; GO:0042759; GO:0046872; GO:0102866
## 4                                                                                                                                                 GO:0003676; GO:0005737; GO:0008270; GO:0015074; GO:0019899; GO:0042575
## 5                                                                                                                                                                                                 GO:0000506; GO:0006506
## 6                                                                                                                                                             GO:0000287; GO:0005509; GO:0005737; GO:0007602; GO:0055074
write.table(annot_tab, 
            file = "../references/annotation/protein-GO.tsv", 
            sep = "\t", 
            row.names = FALSE, 
            quote = FALSE)

DE_05_SwissProt <- DE_05 %>% left_join(annot_tab) %>% select(query,everything()) 
## Joining with `by = join_by(query)`
write.csv(as.data.frame(DE_05_SwissProt), 
          file="../output_RNA/differential_expression/DE_05_SwissProt_annotation.csv")
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

DE_05_SwissProt$short_name <- ifelse(nchar(DE_05_SwissProt$ProteinNames) > 30, 
                            paste0(substr(DE_05_SwissProt$ProteinNames, 1, 27), "..."), 
                            DE_05_SwissProt$ProteinNames)

gene_labels <- DE_05_SwissProt %>% 
  select(query,short_name) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g6351.t1"  
##  [2] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [5] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [8] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [10] "Pocillopora_acuta_HIv2___RNAseq.g3832.t1"  
## [11] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [12] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [13] "Pocillopora_acuta_HIv2___RNAseq.g25431.t1" 
## [14] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [15] "Pocillopora_acuta_HIv2___RNAseq.g9588.t1"  
## [16] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
## [17] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [18] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [19] "Pocillopora_acuta_HIv2___RNAseq.g16202.t1" 
## [20] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [21] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [23] "Pocillopora_acuta_HIv2___RNAseq.g26604.t1" 
## [24] "Pocillopora_acuta_HIv2___TS.g18100.t1"     
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [26] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [27] "Pocillopora_acuta_HIv2___RNAseq.g25327.t1" 
## [28] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [29] "Pocillopora_acuta_HIv2___RNAseq.g19085.t1" 
## [30] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1" 
## [31] "Pocillopora_acuta_HIv2___TS.g11360.t1"     
## [32] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [33] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [34] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [35] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [36] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [37] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [38] "Pocillopora_acuta_HIv2___RNAseq.g21374.t1" 
## [39] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [40] "Pocillopora_acuta_HIv2___RNAseq.g5252.t1"  
## [41] "Pocillopora_acuta_HIv2___RNAseq.g22853.t1" 
## [42] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [43] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [44] "Pocillopora_acuta_HIv2___TS.g16802.t1"     
## [45] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [46] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [47] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [48] "Pocillopora_acuta_HIv2___RNAseq.g26594.t2" 
## [49] "Pocillopora_acuta_HIv2___RNAseq.g19115.t1" 
## [50] "Pocillopora_acuta_HIv2___TS.g4983.t1"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_DE_SwissProt")

#view most significantly differentially expressed genes by LFC

select <- order(abs(res$log2FoldChange),decreasing = TRUE)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [2] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [5] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [8] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [10] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [11] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [12] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [13] "Pocillopora_acuta_HIv2___RNAseq.g22261.t1" 
## [14] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [15] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [16] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [17] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1" 
## [18] "Pocillopora_acuta_HIv2___TS.g4983.t1"      
## [19] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [20] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [21] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [23] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [24] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [26] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [27] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [28] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [29] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [30] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [31] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [32] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [33] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [34] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [35] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [36] "Pocillopora_acuta_HIv2___RNAseq.g9631.t1"  
## [37] "Pocillopora_acuta_HIv2___RNAseq.g7627.t1"  
## [38] "Pocillopora_acuta_HIv2___RNAseq.g8062.t1"  
## [39] "Pocillopora_acuta_HIv2___RNAseq.g1102.t1"  
## [40] "Pocillopora_acuta_HIv2___RNAseq.g14484.t1" 
## [41] "Pocillopora_acuta_HIv2___RNAseq.g21373.t1" 
## [42] "Pocillopora_acuta_HIv2___TS.g16384.t1"     
## [43] "Pocillopora_acuta_HIv2___RNAseq.g13561.t1" 
## [44] "Pocillopora_acuta_HIv2___RNAseq.g1126.t1"  
## [45] "Pocillopora_acuta_HIv2___RNAseq.g9210.t1"  
## [46] "Pocillopora_acuta_HIv2___RNAseq.g24649.t1" 
## [47] "Pocillopora_acuta_HIv2___RNAseq.g11659.t1" 
## [48] "Pocillopora_acuta_HIv2___TS.g25577.t1a"    
## [49] "Pocillopora_acuta_HIv2___RNAseq.g27681.t1b"
## [50] "Pocillopora_acuta_HIv2___TS.g1968.t2"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_LFC_DE_SwissProt")

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "../output_RNA/differential_expression/up_Aboral_SwissProt")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "../output_RNA/differential_expression/up_OralEpi_SwissProt")

df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

DE_05_SwissProt$short_GO <- ifelse(nchar(DE_05_SwissProt$BiologicalProcess) > 30, 
                            paste0(substr(DE_05_SwissProt$BiologicalProcess, 1, 27), "..."), 
                            DE_05_SwissProt$BiologicalProcess)

gene_labels <- DE_05_SwissProt %>% select(query,short_GO) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_DE_Blast2GO")

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_Aboral
save_ggplot(up_Aboral, "../output_RNA/differential_expression/up_Aboral_Blast2GO")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 5)
up_OralEpi
save_ggplot(up_OralEpi, "../output_RNA/differential_expression/up_OralEpi_Blast2GO")

Combined annotations manually

Manual <- read.csv("../output_RNA/differential_expression/DE_05_Manual_annotation.csv") %>% dplyr::rename("query" = 2, "definition" = 3) %>% arrange(X) 
df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

#view most significantly differentially expressed genes

select <- order(res$padj)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g6351.t1"  
##  [2] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [5] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [8] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [10] "Pocillopora_acuta_HIv2___RNAseq.g3832.t1"  
## [11] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [12] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [13] "Pocillopora_acuta_HIv2___RNAseq.g25431.t1" 
## [14] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [15] "Pocillopora_acuta_HIv2___RNAseq.g9588.t1"  
## [16] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
## [17] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [18] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [19] "Pocillopora_acuta_HIv2___RNAseq.g16202.t1" 
## [20] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [21] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [23] "Pocillopora_acuta_HIv2___RNAseq.g26604.t1" 
## [24] "Pocillopora_acuta_HIv2___TS.g18100.t1"     
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [26] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [27] "Pocillopora_acuta_HIv2___RNAseq.g25327.t1" 
## [28] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [29] "Pocillopora_acuta_HIv2___RNAseq.g19085.t1" 
## [30] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1" 
## [31] "Pocillopora_acuta_HIv2___TS.g11360.t1"     
## [32] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [33] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [34] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [35] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [36] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [37] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [38] "Pocillopora_acuta_HIv2___RNAseq.g21374.t1" 
## [39] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [40] "Pocillopora_acuta_HIv2___RNAseq.g5252.t1"  
## [41] "Pocillopora_acuta_HIv2___RNAseq.g22853.t1" 
## [42] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [43] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [44] "Pocillopora_acuta_HIv2___TS.g16802.t1"     
## [45] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [46] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [47] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [48] "Pocillopora_acuta_HIv2___RNAseq.g26594.t2" 
## [49] "Pocillopora_acuta_HIv2___RNAseq.g19115.t1" 
## [50] "Pocillopora_acuta_HIv2___TS.g4983.t1"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_DE_Manual")

#view most significantly differentially expressed genes by LFC

select <- order(abs(res$log2FoldChange),decreasing = TRUE)[1:50]
rownames(res)[select]
##  [1] "Pocillopora_acuta_HIv2___RNAseq.g28575.t1a"
##  [2] "Pocillopora_acuta_HIv2___TS.g18104.t1"     
##  [3] "Pocillopora_acuta_HIv2___RNAseq.g27038.t1" 
##  [4] "Pocillopora_acuta_HIv2___RNAseq.g14025.t1" 
##  [5] "Pocillopora_acuta_HIv2___RNAseq.g2165.t1"  
##  [6] "Pocillopora_acuta_HIv2___RNAseq.g14330.t3" 
##  [7] "Pocillopora_acuta_HIv2___RNAseq.g14090.t1" 
##  [8] "Pocillopora_acuta_HIv2___TS.g30765.t1"     
##  [9] "Pocillopora_acuta_HIv2___RNAseq.g8006.t1"  
## [10] "Pocillopora_acuta_HIv2___RNAseq.g10378.t1" 
## [11] "Pocillopora_acuta_HIv2___RNAseq.g14253.t1" 
## [12] "Pocillopora_acuta_HIv2___RNAseq.11056_t"   
## [13] "Pocillopora_acuta_HIv2___RNAseq.g22261.t1" 
## [14] "Pocillopora_acuta_HIv2___RNAseq.g14021.t1" 
## [15] "Pocillopora_acuta_HIv2___RNAseq.g19082.t1" 
## [16] "Pocillopora_acuta_HIv2___TS.g19991.t2"     
## [17] "Pocillopora_acuta_HIv2___RNAseq.g19284.t1" 
## [18] "Pocillopora_acuta_HIv2___TS.g4983.t1"      
## [19] "Pocillopora_acuta_HIv2___RNAseq.g21000.t1" 
## [20] "Pocillopora_acuta_HIv2___RNAseq.g20860.t1" 
## [21] "Pocillopora_acuta_HIv2___TS.g9414.t1"      
## [22] "Pocillopora_acuta_HIv2___RNAseq.g8588.t1"  
## [23] "Pocillopora_acuta_HIv2___RNAseq.g11588.t1" 
## [24] "Pocillopora_acuta_HIv2___RNAseq.g12281.t1" 
## [25] "Pocillopora_acuta_HIv2___TS.g27642.t1b"    
## [26] "Pocillopora_acuta_HIv2___RNAseq.g7803.t1"  
## [27] "Pocillopora_acuta_HIv2___RNAseq.g8119.t1"  
## [28] "Pocillopora_acuta_HIv2___RNAseq.g17117.t1" 
## [29] "Pocillopora_acuta_HIv2___RNAseq.g22978.t3b"
## [30] "Pocillopora_acuta_HIv2___RNAseq.30415_t"   
## [31] "Pocillopora_acuta_HIv2___TS.g16008.t2"     
## [32] "Pocillopora_acuta_HIv2___RNAseq.g2406.t1"  
## [33] "Pocillopora_acuta_HIv2___RNAseq.g25759.t1" 
## [34] "Pocillopora_acuta_HIv2___RNAseq.g24120.t1" 
## [35] "Pocillopora_acuta_HIv2___RNAseq.g14336.t1" 
## [36] "Pocillopora_acuta_HIv2___RNAseq.g9631.t1"  
## [37] "Pocillopora_acuta_HIv2___RNAseq.g7627.t1"  
## [38] "Pocillopora_acuta_HIv2___RNAseq.g8062.t1"  
## [39] "Pocillopora_acuta_HIv2___RNAseq.g1102.t1"  
## [40] "Pocillopora_acuta_HIv2___RNAseq.g14484.t1" 
## [41] "Pocillopora_acuta_HIv2___RNAseq.g21373.t1" 
## [42] "Pocillopora_acuta_HIv2___TS.g16384.t1"     
## [43] "Pocillopora_acuta_HIv2___RNAseq.g13561.t1" 
## [44] "Pocillopora_acuta_HIv2___RNAseq.g1126.t1"  
## [45] "Pocillopora_acuta_HIv2___RNAseq.g9210.t1"  
## [46] "Pocillopora_acuta_HIv2___RNAseq.g24649.t1" 
## [47] "Pocillopora_acuta_HIv2___RNAseq.g11659.t1" 
## [48] "Pocillopora_acuta_HIv2___TS.g25577.t1a"    
## [49] "Pocillopora_acuta_HIv2___RNAseq.g27681.t1b"
## [50] "Pocillopora_acuta_HIv2___TS.g1968.t2"
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/top50_LFC_DE_Manual")

#view genes Higher in Aboral, Lower in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange,decreasing = TRUE)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_Aboral <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), labels_col = NA, annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 8.5)
up_Aboral
save_ggplot(up_Aboral, "../output_RNA/differential_expression/up_Aboral_Manual")

#view genes Lower in Aboral, Higher in OralEpi, ordered by log2FoldChange
select <- order(res$log2FoldChange)[1:50]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
up_OralEpi <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=FALSE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), labels_col = NA, annotation_colors = ann_colors,
         labels_row =gene_labels[match(rownames(res)[select],(gene_labels$query)),2], fontsize_row = 8.5)
up_OralEpi
save_ggplot(up_OralEpi, "../output_RNA/differential_expression/up_OralEpi_Manual")

Expression of certain genes of interest: blast against P. acuta genome

yin yang Transctiption factor

cd ../references

#download the genome protein fasta if you have not already
wget http://cyanophora.rutgers.edu/Pocillopora_acuta/Pocillopora_acuta_HIv2.genes.pep.faa.gz

#unzip file
gunzip Pocillopora_acuta_HIv2.genes.pep.faa.gz

In unity

salloc -p cpu -c 8 --mem 32G

module load uri/main
module load BLAST+/2.15.0-gompi-2023a

#make blast_dbs directory if you haven't done so above
mkdir blast_dbs
cd blast_dbs

makeblastdb -in ../Pocillopora_acuta_HIv2.genes.pep.faa -out Pacuta_prot -dbtype prot

cd ../../output_RNA/differential_expression/

#make blast output directory if you haven't done so above
mkdir blast
cd blast

nano YinYang.txt

add accession numbers of interest:

  • nematostella vectensis transcriptional repressor protein YY1 isoforms
    • XP_048585772.1
    • XP_048585773.1
    • XP_048585774.1
  • human transcription factor YY2
    • NP_996806.2
  • human transcription factor ZFP41 (REX-1)
    • NP_777560.2
XP_048585772.1
XP_048585773.1
XP_048585774.1
NP_996806.2
NP_777560.2
# Read the input file line by line and fetch FASTA sequences
while read -r accession; do
  if [[ -n "$accession" ]]; then
    echo "Fetching $accession..."
    curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=${accession}&rettype=fasta&retmode=text" >> "YinYang.fasta"
    echo >> "YinYang.fasta"  # Add a newline between sequences
    sleep 1  # Avoid hitting rate limits
  fi
done < "YinYang.txt"

# run blast with human-readable output
blastp -query YinYang.fasta -db ../../../references/blast_dbs/Pacuta_prot -out YinYang_blast_results.txt -outfmt 0

#looks like there are a lot of matches for each gene! I am going to do a tab search with a very low e-value cutoff:

# run blast with tabular output
blastp -query YinYang.fasta -db ../../../references/blast_dbs/Pacuta_prot -out YinYang_blast_results_tab.txt -outfmt 6 -evalue 1e-25

Great! Interestingly, the same gene, Pocillopora_acuta_HIv2_RNAseq.g25242.t1 is the top match for all 5 proteins I searched. Pocillopora_acuta_HIv2_TS.g24434.t1 is the second best match for all 5. That is interesting! Pocillopora_acuta_HIv2_RNAseq.g7583.t1 is also worth looking at. And Pocillopora_acuta_HIv2_TS.g21338.t1 matched all three YY1 isoforms.

YinYangs <- c("Pocillopora_acuta_HIv2___RNAseq.g25242.t1",
              "Pocillopora_acuta_HIv2___TS.g24434.t1",
              "Pocillopora_acuta_HIv2___RNAseq.g7583.t1",
              "Pocillopora_acuta_HIv2___TS.g21338.t1")

for (i in 1:length(YinYangs)){
  plotCounts(dds, gene=YinYangs[i], intgroup=c("Tissue"),)
}

as.data.frame(resOrdered)[YinYangs,]
##                                             baseMean log2FoldChange     lfcSE
## Pocillopora_acuta_HIv2___RNAseq.g25242.t1 2388.43282    -0.65267532 0.8125595
## Pocillopora_acuta_HIv2___TS.g24434.t1       86.14232    -0.16148954 1.0080043
## Pocillopora_acuta_HIv2___RNAseq.g7583.t1   429.08727     0.06121494 0.9629085
## Pocillopora_acuta_HIv2___TS.g21338.t1      277.12637    -0.03463576 1.0095974
##                                              pvalue      padj
## Pocillopora_acuta_HIv2___RNAseq.g25242.t1 0.1695296 0.3778825
## Pocillopora_acuta_HIv2___TS.g24434.t1     0.2933826 0.5402220
## Pocillopora_acuta_HIv2___RNAseq.g7583.t1  0.8212358 0.9266210
## Pocillopora_acuta_HIv2___TS.g21338.t1     0.2453329 0.4838418
##                                                                               query
## Pocillopora_acuta_HIv2___RNAseq.g25242.t1 Pocillopora_acuta_HIv2___RNAseq.g25242.t1
## Pocillopora_acuta_HIv2___TS.g24434.t1         Pocillopora_acuta_HIv2___TS.g24434.t1
## Pocillopora_acuta_HIv2___RNAseq.g7583.t1   Pocillopora_acuta_HIv2___RNAseq.g7583.t1
## Pocillopora_acuta_HIv2___TS.g21338.t1         Pocillopora_acuta_HIv2___TS.g21338.t1

Okay! None of these genes are differentially expressed between the tissues. That is interesting and good to know. Pocillopora_acuta_HIv2___RNAseq.g25242.t1 has the highest basal expression of all the potential isoforms of this transcription factor.

TRP Channels

df <- as.data.frame(colData(dds)[,c("Tissue","Fragment")])

DE_05_SwissProt$short_name <- ifelse(nchar(DE_05_SwissProt$ProteinNames) > 80, 
                            paste0(substr(DE_05_SwissProt$ProteinNames, 1, 77), "..."), 
                            DE_05_SwissProt$ProteinNames)

gene_labels <- DE_05_SwissProt %>% 
  select(query,short_name) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

TRP <- Manual %>% filter(grepl("transient", ProteinNames, ignore.case = TRUE)) 
select <- TRP$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 12)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/TRP_DE_SwissProt", width = 6.43, height = 4.25, units = "in", dpi = 300)

annot_tab$short_name <- ifelse(nchar(annot_tab$ProteinNames) > 80, 
                            paste0(substr(annot_tab$ProteinNames, 1, 77), "..."), 
                            annot_tab$ProteinNames)

gene_labels <- annot_tab  %>% 
  select(query,short_name) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

TRP <- annot_tab %>% filter(grepl("transient", ProteinNames, ignore.case = TRUE))
select1 <- TRP$query
select <- match(select1,rownames(vsd))
select <- select[!is.na(select)]

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select1,gene_labels$query),2], fontsize_row = 6)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/TRP_SwissProt")

biomin de heatmap nice

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes


DE_05_biomin_filtered <- DE_05_biomin %>% left_join(Manual,by="query" ) %>% filter(!(Heatmap_Label %in% c("Myosin-9", "Actin, cytoplasmic")))
  
select <- DE_05_biomin_filtered$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_biomin <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
DE_biomin
save_ggplot(DE_biomin, "../output_RNA/differential_expression/DE_biomin")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k as needed

# Create a dataframe to manage clusters and reordering
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate heatmap with reordered rows and labels
DE_biomin <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  show_rownames = TRUE,
  cutree_cols = 2,
  annotation_col = (df %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8
)
save_ggplot(DE_biomin, "../output_RNA/differential_expression/clusters_clean/DE_biomin")

#############################
gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

DE_05_Biomin_broc_filtered <- DE_05_Biomin_broc %>% left_join(Manual,by="query" ) %>% filter(!(Heatmap_Label %in% c("Myosin-9", "Actin, cytoplasmic")))
  
select <- DE_05_Biomin_broc_filtered$query
z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
DE_Biomin_broc <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
DE_Biomin_broc
save_ggplot(DE_Biomin_broc, "../output_RNA/differential_expression/DE_Biomin_broc")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k as needed

# Create a dataframe to manage clusters and reordering
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate heatmap with reordered rows and labels
DE_Biomin_broc <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  show_rownames = TRUE,
  cutree_cols = 2,
  annotation_col = (df %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8
)
save_ggplot(DE_Biomin_broc, "../output_RNA/differential_expression/clusters_clean/DE_Biomin_broc")

###wnt and hox

WNT <- Manual %>% filter(grepl("wnt", Heatmap_Label, ignore.case = TRUE)|grepl("frizzle", Heatmap_Label, ignore.case = TRUE)|grepl("homeobox", Heatmap_Label, ignore.case = TRUE)|grepl("hox", Heatmap_Label, ignore.case = TRUE)|grepl("forkhead", Heatmap_Label, ignore.case = TRUE))#|grepl("wnt", BiologicalProcess, ignore.case = TRUE))
select <- WNT$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/WNT_Hox_DE_SwissProt")

# Perform clustering
# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (df %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  cutree_rows = 2,  # Keep row clustering if desired
  gaps_row = 24  # Add breaks only between clusters
)


# Save the heatmap
save_ggplot(top50_DE, "../output_RNA/differential_expression/clusters_clean/WNT_Hox_DE_SwissProt")

Hox genes Nematostella

differentially expressed

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

select <- DE_05_Hox$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/Hox_Nvec_DE_SwissProt")

# Perform clustering
# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (df %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  cutree_rows = 2,  # Keep row clustering if desired
  gaps_row = 4  # Add breaks only between clusters
)


# Save the heatmap
save_ggplot(top50_DE, "../output_RNA/differential_expression/clusters_clean/Hox_Nvec_DE_SwissProt")

all

labels <- Hox_all_res %>% drop_na() %>% pull(def_short)

select <- Hox_all_res %>% drop_na() %>% pull(query)

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = labels, fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/Hox_Nvec_all_SwissProt")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = labels,
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (df %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  cutree_rows = 2,  # Keep row clustering if desired
  gaps_row = 9  # Add breaks only between clusters
)


# Save the heatmap
save_ggplot(top50_DE, "../output_RNA/differential_expression/clusters_clean/Hox_Nvec_all_SwissProt")

He et al 2023 TFs/genes of interest Nematostella

differentially expressed

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

select <- unique(DE_05_He_etal$query)

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/He_etal_Nvec_DE_SwissProt")

# Perform clustering
# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (df %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  cutree_rows = 2,  # Keep row clustering if desired
  gaps_row = 4  # Add breaks only between clusters
)

# Save the heatmap
save_ggplot(top50_DE, "../output_RNA/differential_expression/clusters_clean/He_etal_Nvec_DE_SwissProt")

### DuBuc et al 2018 Wnt/Aboral-Oral Patterning Genes, interest Nematostella

differentially expressed

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

select <- unique(DE_05_DuBuc_etal$query)

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/DuBuc_etal_Nvec_DE_SwissProt")

# Perform clustering
# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Ensure there is only 1 gap between clusters
#row_breaks <- which(cluster_assignments != cluster_assignments[1])  # Only where cluster switches

# Generate the heatmap with reordered rows and labels
top50_DE <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (df %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white"#,  # Optional border around clusters
  #cutree_rows = 2,  # Keep row clustering if desired
  #gaps_row = 4  # Add breaks only between clusters
)

# Save the heatmap
save_ggplot(top50_DE, "../output_RNA/differential_expression/clusters_clean/DuBuc_etal_Nvec_DE_SwissProt")

bacteria genes

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes


mucin <- Manual %>% filter(grepl("mucin", ProteinNames, ignore.case = TRUE)|grepl("toll-", ProteinNames, ignore.case = TRUE)|grepl("ZP", ProteinNames, ignore.case = TRUE)|grepl("lectin", ProteinNames, ignore.case = TRUE)|grepl("nitric", Heatmap_Label, ignore.case = TRUE)) %>% filter(Heatmap_Label !="Cnidocyte marker protein (Collectin-11)")
select <- mucin$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 2)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/bacteria_DE_SwissProt")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 2) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate the heatmap with reordered rows and labels
mucins <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (df %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  cutree_rows = 2,  # Keep row clustering if desired
  gaps_row = c(24,38)  # Add breaks only between clusters
)

# Save the heatmap
save_ggplot(mucins, "../output_RNA/differential_expression/clusters_clean/bacteria_DE_SwissProt")

temp and light sensing

Manual <- read.csv("../output_RNA/differential_expression/DE_05_Manual_annotation.csv") %>% dplyr::rename("query" = 2, "definition" = 3) %>% arrange(X) 

gene_labels <- Manual %>% 
  select(query,Heatmap_Label) %>%
  mutate_all(~ ifelse(is.na(.), "", .)) #replace NAs with "" for labelling purposes

sensors <- Manual %>% filter(grepl("TRP", Heatmap_Label, ignore.case = TRUE)|grepl("cellular response to light stimulus", BiologicalProcess, ignore.case = TRUE)|grepl("detection of mechanical stimulus involved in sensory perception", BiologicalProcess, ignore.case = TRUE))


select <- sensors$query

z_scores <- t(scale(t(assay(vsd)[select, ]), center = TRUE, scale = TRUE))
top50_DE <- pheatmap(z_scores, color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200), cluster_rows=TRUE, show_rownames=TRUE,
         cluster_cols=TRUE, cutree_cols = 2,annotation_col=(df%>% select(Tissue)), annotation_colors = ann_colors,
         labels_row = gene_labels[match(select,(gene_labels$query)),2], fontsize_row = 8, cutree_rows = 3)
top50_DE
save_ggplot(top50_DE, "../output_RNA/differential_expression/sensors_DE_SwissProt")

# Perform clustering
row_clusters <- hclust(dist(z_scores))
cluster_assignments <- cutree(row_clusters, k = 3) # Adjust k for the number of clusters

# Create a dataframe for clustering and label management
clustered_data <- data.frame(
  query = select,
  Heatmap_Label = gene_labels[match(select, gene_labels$query), 2],
  cluster = cluster_assignments
)

# Reorder rows within each cluster alphabetically by their labels
clustered_data <- clustered_data %>%
  mutate(cluster = factor(cluster, levels = c(1, 3, 2))) %>%
  arrange(cluster, Heatmap_Label)

# Reorder the z_scores matrix and labels based on the new order
z_scores <- z_scores[match(clustered_data$query, rownames(z_scores)), ]
ordered_labels <- clustered_data$Heatmap_Label

# Generate the heatmap with reordered rows and labels
transporters_heat <- pheatmap(
  z_scores,
  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
  cluster_rows = FALSE,  # Disable clustering since rows are pre-ordered
  cluster_cols = TRUE,
  cutree_cols = 2,
  annotation_col = (df %>% select(Tissue)),
  annotation_colors = ann_colors,
  labels_row = ordered_labels,
  fontsize_row = 8,
  borders_color = "white",  # Optional border around clusters
  cutree_rows = 2,  # Keep row clustering if desired
  gaps_row = c(10,18)  # Add breaks only between clusters
)

# Save the heatmap
save_ggplot(transporters_heat, "../output_RNA/differential_expression/clusters_clean/sensors_DE_SwissProt")

random

# Create annotation data frame
df <- as.data.frame(colData(dds)[, c("Tissue", "Fragment")])

# Function to generate short names for proteins
generate_short_name <- function(data) {
  data %>%
    mutate(short_name = ifelse(nchar(ProteinNames) > 60, 
                               paste0(substr(ProteinNames, 1, 57), "..."), 
                               ProteinNames))
}

# Function to create gene labels
create_gene_labels <- function(data) {
  data %>%
    select(query, short_name) %>%
    mutate_all(~ ifelse(is.na(.), "", .)) # Replace NAs with ""
}

# Function to generate z-scores for selected genes
calculate_z_scores <- function(data, selection, vsd_matrix) {
  selected_genes <- match(selection, rownames(vsd_matrix))
  selected_genes <- selected_genes[!is.na(selected_genes)] # Remove NAs
  t(scale(t(assay(vsd_matrix)[selected_genes, ]), center = TRUE, scale = TRUE))
}

# Function to create and save heatmap
create_heatmap <- function(z_scores, labels_row, annotation_col, annotation_colors, output_path) {
  heatmap <- pheatmap(z_scores, 
                      color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
                      cluster_rows = TRUE, 
                      show_rownames = TRUE,
                      cluster_cols = TRUE, 
                      cutree_cols = 2,
                      annotation_col = annotation_col,
                      annotation_colors = annotation_colors,
                      labels_row = labels_row, 
                      fontsize_row = 6)
  save_ggplot(heatmap, output_path)
}

# Process DE_05_SwissProt
DE_05_SwissProt <- generate_short_name(DE_05_SwissProt)
gene_labels <- create_gene_labels(DE_05_SwissProt)
TRP <- DE_05_SwissProt %>% filter(grepl("transient", ProteinNames, ignore.case = TRUE))
select <- TRP$query
z_scores <- calculate_z_scores(DE_05_SwissProt, select, vsd)
create_heatmap(z_scores, 
               labels_row = gene_labels[match(select, gene_labels$query), 2], 
               annotation_col = df %>% select(Tissue), 
               annotation_colors = ann_colors, 
               output_path = "../output_RNA/differential_expression/TRP_DE_SwissProt")

# Process annot_tab
annot_tab <- generate_short_name(annot_tab)
gene_labels <- create_gene_labels(annot_tab)
TRP <- annot_tab %>% filter(grepl("transient", ProteinNames, ignore.case = TRUE))
TRP <- left_join(TRP, as.data.frame(resOrdered)) %>% filter(!is.na(log2FoldChange))
## Joining with `by = join_by(query)`
select1 <- TRP$query
z_scores <- calculate_z_scores(annot_tab, select1, vsd)
create_heatmap(z_scores, 
               labels_row = TRP$short_name, 
               annotation_col = df %>% select(Tissue), 
               annotation_colors = ann_colors, 
               output_path = "../output_RNA/differential_expression/TRP_SwissProt")

significant <- ifelse(row.names(z_scores) %in% DE_05$query, "Significant", "Not Significant")

# Add this as a new annotation
row_annotation <- data.frame(Significance = significant)
rownames(row_annotation) <- rownames(z_scores)
row_annotation_colors <- list(Significance = c("Significant" = "red", "Not Significant" = "grey"))

heatmap <- pheatmap(z_scores,  color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdBu")))(200),
                      cluster_rows = TRUE, 
                      show_rownames = TRUE,
                      cluster_cols = TRUE, 
                      cutree_cols = 2,
                    cutree_rows = 5,
                      annotation_col = df %>% select(Tissue),
                      annotation_colors = ann_colors,
                    annotation_row = row_annotation,
          annotation_row_colors = row_annotation_colors,
                      labels_row = TRP$short_name, 
                      fontsize_row = 12)
  save_ggplot(heatmap, "../output_RNA/differential_expression/TRP_SwissProt")

# Process DE_05_SwissProt
DE_05_SwissProt <- generate_short_name(DE_05_SwissProt)
gene_labels <- create_gene_labels(DE_05_SwissProt)
GFP <- DE_05_SwissProt %>% filter(grepl("biolum", BiologicalProcess, ignore.case = TRUE))
select <- GFP$query
z_scores <- calculate_z_scores(DE_05_SwissProt, select, vsd)
create_heatmap(z_scores, 
               labels_row = gene_labels[match(select, gene_labels$query), 2], 
               annotation_col = df %>% select(Tissue), 
               annotation_colors = ann_colors, 
               output_path = "../output_RNA/differential_expression/GFP_DE_SwissProt")

# Process annot_tab
annot_tab <- generate_short_name(annot_tab)
gene_labels <- create_gene_labels(annot_tab)
GFP <- annot_tab %>% filter(grepl("biolum", BiologicalProcess, ignore.case = TRUE))
select1 <- GFP$query
z_scores <- calculate_z_scores(annot_tab, select1, vsd)
create_heatmap(z_scores, 
               labels_row = gene_labels[match(select1, gene_labels$query), 2], 
               annotation_col = df %>% select(Tissue), 
               annotation_colors = ann_colors, 
               output_path = "../output_RNA/differential_expression/GFP_SwissProt")

Aboral Oral Larvae Paper - Pdam genome

Join our data to Pdam annotations

Annot_Pdam <- read.csv("../references/annotation/blastp_Pdam_out.tab", sep = '\t', header = FALSE) %>% select(c(1,2)) %>% dplyr::rename("protein_id" = "V2", "query" = "V1")
Manual_Pdam <- left_join(Manual, Annot_Pdam) 
## Joining with `by = join_by(query)`
library(readxl)
larval_aboral_enriched <- read_excel("../output_RNA/marker_genes/RamonMateu_Pdam_larval.xlsx", sheet = "pocillopora_aboral_enriched_05_")

Manual_Pdam_upaboral <- Manual_Pdam %>% filter(log2FoldChange > 0)
inner_join(larval_aboral_enriched, Manual_Pdam_upaboral, by=c("gene ID"="protein_id")) %>% dim()
## [1] 18 26
#18 genes overlap from the 126 in the paper

larval_oral_enriched <- read_excel("../output_RNA/marker_genes/RamonMateu_Pdam_larval.xlsx", sheet = "pocillopora_oral_enriched_05_lf")

Manual_Pdam_uporal <- Manual_Pdam %>% filter(log2FoldChange < 0)
inner_join(larval_oral_enriched, Manual_Pdam_uporal, by=c("gene ID"="protein_id")) %>% dim()
## [1]  6 26
inner_join(larval_oral_enriched, Manual_Pdam_uporal, by=c("gene ID"="protein_id")) %>% View()

#only 6/83 genes overlap


larval_aboral_clusters <- read_excel("../output_RNA/marker_genes/RamonMateu_Pdam_larval_scRNA.xlsx") %>% filter(!is.na(pocillopora))
## New names:
## • `` -> `...8`
## • `` -> `...9`
## • `` -> `...10`
## • `` -> `...11`
larval_aboral_clusters_Pacuta <- inner_join(larval_aboral_clusters, Manual_Pdam, by=c("pocillopora"="protein_id"))

#my genes upregulated in oral tissue are high in mucous cells, cool

Updating Renv environment:

After you’ve confirmed your code works as expected, use renv::snapshot() to record the packages and their sources in the lockfile.

renv::snapshot()